Main Protocol
(1) Creating GWAS gene list
First, we generate a list of genes implicated by the GWAS we loaded in Before you Begin step (1). In the case of the GWAS we used here as an example, fine mapping was performed to identify a subset of significantly associated SNPs that are indepedent associations and lead SNPs, so here, we'll read in the lead SNP file and use it to subset the full summary statistics for the study. In the original publication, we carried out this procedure for multiple GWAS for bone mineral density that use different measurement modalities, e.g. DEXA scanning and ultrasound-based measurements.
Generating GWAS regions
Regions are degined by the set of SNPs in linkage disequilibirum (LD) > 0.7 with each lead GWAS SNP).
## Reading in lead SNPs
dir.create("./data/gwas/")
## Warning in dir.create("./data/gwas/"): './data/gwas' already exists
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5621629/bin/NIHMS73769-supplement-Supplementary_Table_2.xlsx"
download.file(url = url, destfile = "./data/gwas/gwas_bmd_lead_snps.xlsx")
lead_snps = read_excel("./data/gwas/gwas_bmd_lead_snps.xlsx", sheet = 1, skip = 3)
bmd_snps <- subset(gwas_sumstat, gwas_sumstat$SNP %in% lead_snps$RSID)
# Some of the variants are not SNPs, but indels. LDLink recognizes them by a different naming convention, so we have to modify their identifiers for LDLink to find their proxies
bmd_snps = bmd_snps %>%
mutate(SNP = word(bmd_snps$SNP,1,sep = "_"))
bmd_snps$SNP[grep(":", bmd_snps$SNP)]
## [1] "1:22483649" "10:29087203" "17:27961561" "22:19677948"
bmd_snps$SNP[6] = paste0("chr", bmd_snps$SNP[6])
bmd_snps$SNP[163] = paste0("chr", bmd_snps$SNP[163])
bmd_snps$SNP[260] = paste0("chr", bmd_snps$SNP[260])
bmd_snps$SNP[301] = paste0("chr", bmd_snps$SNP[301])
# Next we batch query LDLink to get all of the proxy SNPs
LDproxy_batch(snp = bmd_snps$SNP, pop = "EUR", r2d = "r2", token = "c0f613f149ab", append = TRUE)
## error: rs191147097 is monoallelic in the EUR population.,
## error: rs149333699 is monoallelic in the EUR population.,
## error: rs184953495 is monoallelic in the EUR population.,
## error: rs10668066 is not a biallelic variant.,
## error: rs143581991 is not in 1000G reference panel.,
## error: rs746652558 is not in 1000G reference panel.,
## error: rs12806687 is not in 1000G reference panel.,
## error: rs72186592 is not in dbSNP build 151.,
## error: rs202234992 is not in 1000G reference panel.,
# some SNPs will return an error in the LD search. We will stil annotate the nearest up and downstream genes from these SNPs, but their window will just be their coordinate location
no_ld_snps = c("rs191147097", "rs149333699", "rs184953495", "rs10668066",
"rs143581991", "chr10:29087203", "rs12806687", "rs72186592", "rs202234992")
# we format these so we can combine them with the LD 0.7 regions below
no_ld_regions = bmd_snps %>%
filter(SNP %in% no_ld_snps) %>%
dplyr::rename(query_snp = SNP) %>%
group_by(query_snp) %>%
summarize(min = min(BP), max = max(BP), chr = paste0("chr",max(CHR)))
# We read back in the results of the LDLink proxy query
proxies = read_tsv("./combined_query_snp_list.txt", skip = 1, col_names = FALSE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_character(),
## X11 = col_character(),
## X12 = col_character()
## )
colnames(proxies) = c("index", "query_snp", "rs_id", "coord", "alleles", "maf", "distance", "d_prime", "r2", "correlated_alleles", "regulome_db", "function")
# now we filter for proxies with R2 greater than 0.7, and for each query_snp we will create an entry in our regions table with the max and min distance from the
ld_regions = proxies %>%
filter(r2 >= 0.7) %>%
separate(col = coord, sep = ":", into = c("chr", "coord")) %>%
group_by(query_snp) %>%
summarize(chr = max(chr), min = as.numeric(min(coord)), max = as.numeric(max(coord)))
# then we combine both sets to get one set of regions of LD >= 0.7 for each GWAS lead SNP
ld07_gwas_regions = bind_rows(no_ld_regions, ld_regions) %>%
dplyr::select(chr, start = min, end = max, query_snp)
# make sure none of the regions have a negative width
for(i in 1:nrow(ld07_gwas_regions)){
if((ld07_gwas_regions[i,3]-ld07_gwas_regions[i,2]) < 0){
print(paste0("swap row #", i))
start = ld07_gwas_regions[i,3]
end = ld07_gwas_regions[i,2]
ld07_gwas_regions[i,3] = end
ld07_gwas_regions[i,2] = start
}
}
## [1] "swap row #46"
Intersection GWAS regions with reference gene file
Next, we want to find the intersection between these regions and the full human geneset. Our gene set was generated from the hg19 reference genome using the UCSC gene tables. It is important to match the coordinates of the genome used in the GWAS study to the gene set
## Reading in UCSC RefSeq gene table ##
url = "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz"
genes = read_tsv(url, col_names = FALSE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_double(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_number(),
## X11 = col_number(),
## X12 = col_double(),
## X13 = col_character(),
## X14 = col_character(),
## X15 = col_character(),
## X16 = col_character()
## )
colnames(genes) = c("bins", "refseq_id", "chr", "strand", "txStart", "txEnd", "cdsStart", "cdsEnd", "exonCount", "ExonStarts", "ExonEnds", "score", "name", "cdsStartStat", "cdsEndStat", "exonFrames")
genes = genes[,c(3,5,6,2,13)]
genes = genes %>%
filter(str_detect(refseq_id, "NM_"))
# check that the column types are correct and chromosome notation matches
head(genes)
head(ld07_gwas_regions)
# Next, we're going to use the Genomic Ranges package to annotate our ranges with genes
gr_genes <- GRanges(genes)
gr_gwas <- GRanges(ld07_gwas_regions)
# We find the genes that overlap the ranges from the GWAS study
overlaps <- GenomicRanges::findOverlaps(gr_genes, gr_gwas)
overlaps_info = cbind(ld07_gwas_regions[overlaps@to,], genes[overlaps@from,]$name)
overlaps_info = overlaps_info[,c(1,4,5)]
overlaps_info$type = "overlaps"
overlaps_info = overlaps_info[!duplicated(overlaps_info), ]
colnames(overlaps_info) = c("chromosome", "gwas_snp", "gene_name", "type")
# We find the nearest genes preceding the ranges from the GWAS study
nearest_precede <- GenomicRanges::precede(gr_gwas, gr_genes)
precede_info = cbind(ld07_gwas_regions, genes[nearest_precede,])
precede_info = precede_info[,c(1,4,9)]
precede_info$type = "precede"
precede_info = precede_info[!duplicated(precede_info), ]
colnames(precede_info) = c("chromosome", "gwas_snp", "gene_name", "type")
precede_info = precede_info[-which(precede_info$gwas_snp %in% overlaps_info$gwas_snp),]
# We find the nearest genes following the ranges from the GWAS study
nearest_follow <- GenomicRanges::follow(gr_gwas, gr_genes)
follow_info = cbind(ld07_gwas_regions, genes[nearest_follow,])
follow_info = follow_info[,c(1,4,9)]
follow_info$type = "follow"
follow_info = follow_info[!duplicated(follow_info), ]
colnames(follow_info) = c("chromosome", "gwas_snp", "gene_name", "type")
follow_info = follow_info[-which(follow_info$gwas_snp %in% overlaps_info$gwas_snp),]
# Finally, we combine these results into one table
gwas_region_genes = rbind(overlaps_info, precede_info, follow_info)
gwas_region_genes
Identify mouse homologs for GWAS genes
Now, we now have a list of human genes that overlap the regions as defined by LD from our study of interest, however if you are using an expression dataset from another species, such as the mouse, you will also need to convert to the appropriate homologs. We can use a homology map to generate a list of mouse homologs for the human gene list from MGI (http://www.informatics.jax.org/faq/ORTH_dload.shtml).
## Reading in homology map ##
homology = read_tsv(url("https://github.com/Farber-Lab/STAR_protocols_core_modules/raw/main/data/homology/HOM_MouseHumanSequence.txt"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## `HomoloGene ID` = col_double(),
## `Common Organism Name` = col_character(),
## `NCBI Taxon ID` = col_double(),
## Symbol = col_character(),
## `EntrezGene ID` = col_double(),
## `Mouse MGI ID` = col_character(),
## `HGNC ID` = col_character(),
## `OMIM Gene ID` = col_character(),
## `Genetic Location` = col_character(),
## `Genomic Coordinates (mouse: , human: )` = col_character(),
## `Nucleotide RefSeq IDs` = col_character(),
## `Protein RefSeq IDs` = col_character(),
## `SWISS_PROT IDs` = col_character()
## )
gwas_gene_hom <- homology %>%
filter(Symbol %in% gwas_region_genes$gene_name)
gwas_mouse_hom <- homology %>%
filter(`HomoloGene ID` %in% gwas_gene_hom$`HomoloGene ID`) %>%
filter(`NCBI Taxon ID` == 10090)
gwas_mouse_genes = gwas_mouse_hom$Symbol
(2) Construct a co-expression network
Next, we apply weighted gene coexpression network analysis to the expression matrix. This section heavily represents the content from the WGCNA tutorials created by the author's of WGCNA, found here: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/
Setting WGCNA options
options(stringsAsFactors = FALSE)
Quality Control of Expression Data
In this section, the samples in the expression matrix are clustered and plotted to identify outliers.
#Using a cluster tree to find sample outliers
sampleTree = hclust(dist(norm_exp_mat), method = "average")
# sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
# None of these samples require removal from the analysis
Power Calculation for Network Construction
Next, the power used for calculating the network is empirically chosen by calculating scale-free topology and mean connectivity. A general heuristic is to construct the network the lowest power at which that scale-free topology index curve flattens out.
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(norm_exp_mat, powerVector = powers, verbose = 5, networkType = "signed")
## pickSoftThreshold: will use block size 1529.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1529 of 29255
## Warning: executing %dopar% sequentially: no parallel backend registered
## ..working on genes 1530 through 3058 of 29255
## ..working on genes 3059 through 4587 of 29255
## ..working on genes 4588 through 6116 of 29255
## ..working on genes 6117 through 7645 of 29255
## ..working on genes 7646 through 9174 of 29255
## ..working on genes 9175 through 10703 of 29255
## ..working on genes 10704 through 12232 of 29255
## ..working on genes 12233 through 13761 of 29255
## ..working on genes 13762 through 15290 of 29255
## ..working on genes 15291 through 16819 of 29255
## ..working on genes 16820 through 18348 of 29255
## ..working on genes 18349 through 19877 of 29255
## ..working on genes 19878 through 21406 of 29255
## ..working on genes 21407 through 22935 of 29255
## ..working on genes 22936 through 24464 of 29255
## ..working on genes 24465 through 25993 of 29255
## ..working on genes 25994 through 27522 of 29255
## ..working on genes 27523 through 29051 of 29255
## ..working on genes 29052 through 29255 of 29255
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.0985 -74.30 0.978 14600.00 14600.00 14900.0
## 2 2 0.3610 -59.50 0.873 7540.00 7530.00 7900.0
## 3 3 0.6250 -41.80 0.890 3990.00 3980.00 4390.0
## 4 4 0.6770 -27.10 0.901 2170.00 2160.00 2550.0
## 5 5 0.7300 -19.30 0.925 1210.00 1200.00 1540.0
## 6 6 0.7800 -14.40 0.945 687.00 678.00 967.0
## 7 7 0.8490 -10.60 0.938 399.00 391.00 632.0
## 8 8 0.9160 -8.79 0.954 237.00 230.00 448.0
## 9 9 0.9600 -7.29 0.968 143.00 138.00 333.0
## 10 10 0.9810 -6.03 0.977 88.20 83.60 258.0
## 11 12 0.9920 -4.38 0.994 35.30 32.30 174.0
## 12 14 0.9910 -3.37 0.994 15.20 13.10 131.0
## 13 16 0.9900 -2.75 0.989 7.06 5.61 105.0
## 14 18 0.9770 -2.36 0.970 3.53 2.50 87.2
## 15 20 0.9710 -2.08 0.964 1.91 1.16 74.1
# Plot the results:
# sizeGrWindow(9, 5)
# par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");abline(h=0.9,col="red")
# this line corresponds to using an R^2 cut-off of h
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"));text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
#### power = 9, lowest power before scale-free topology fit levels off
Calculating the Network
Finally, a signed network is calcuated using the power identified in the power calculation step, a minimum module size of 20 trasnscripts,
dir.create("./data/network/")
## Warning in dir.create("./data/network/"): './data/network' already exists
net = blockwiseModules(norm_exp_mat, power = 9,
TOMType = "signed", minModuleSize = 20,
networkType = "signed",
reassignThreshold = 0, mergeCutHeight = 0.15,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "./data/network/exp_mat_TOM",
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ....pre-clustering genes to determine blocks..
## Projective K-means:
## ..k-means clustering..
## ..merging smaller clusters...
## Block sizes:
## gBlocks
## 1 2 3 4 5 6
## 4996 4980 4964 4880 4739 4696
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file ./data/network/exp_mat_TOM-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 546 genes from module 1 because their KME is too low.
## ..removing 252 genes from module 2 because their KME is too low.
## ..removing 143 genes from module 3 because their KME is too low.
## ..removing 144 genes from module 4 because their KME is too low.
## ..removing 90 genes from module 5 because their KME is too low.
## ..removing 77 genes from module 6 because their KME is too low.
## ..removing 29 genes from module 7 because their KME is too low.
## ..removing 71 genes from module 8 because their KME is too low.
## ..removing 30 genes from module 9 because their KME is too low.
## ..removing 3 genes from module 11 because their KME is too low.
## ..Working on block 2 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 2 into file ./data/network/exp_mat_TOM-block.2.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 513 genes from module 1 because their KME is too low.
## ..removing 217 genes from module 2 because their KME is too low.
## ..removing 304 genes from module 3 because their KME is too low.
## ..removing 173 genes from module 4 because their KME is too low.
## ..removing 75 genes from module 5 because their KME is too low.
## ..removing 18 genes from module 6 because their KME is too low.
## ..Working on block 3 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 3 into file ./data/network/exp_mat_TOM-block.3.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1171 genes from module 1 because their KME is too low.
## ..removing 1055 genes from module 2 because their KME is too low.
## ..Working on block 4 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 4 into file ./data/network/exp_mat_TOM-block.4.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 478 genes from module 1 because their KME is too low.
## ..removing 306 genes from module 2 because their KME is too low.
## ..removing 264 genes from module 3 because their KME is too low.
## ..removing 203 genes from module 4 because their KME is too low.
## ..removing 152 genes from module 5 because their KME is too low.
## ..removing 96 genes from module 6 because their KME is too low.
## ..removing 23 genes from module 7 because their KME is too low.
## ..removing 10 genes from module 8 because their KME is too low.
## ..removing 4 genes from module 9 because their KME is too low.
## ..removing 5 genes from module 10 because their KME is too low.
## ..removing 3 genes from module 11 because their KME is too low.
## ..removing 2 genes from module 12 because their KME is too low.
## ..Working on block 5 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 5 into file ./data/network/exp_mat_TOM-block.5.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 706 genes from module 1 because their KME is too low.
## ..removing 289 genes from module 2 because their KME is too low.
## ..removing 84 genes from module 3 because their KME is too low.
## ..removing 134 genes from module 4 because their KME is too low.
## ..removing 70 genes from module 5 because their KME is too low.
## ..removing 64 genes from module 6 because their KME is too low.
## ..removing 68 genes from module 7 because their KME is too low.
## ..removing 57 genes from module 8 because their KME is too low.
## ..removing 40 genes from module 9 because their KME is too low.
## ..removing 28 genes from module 10 because their KME is too low.
## ..removing 26 genes from module 11 because their KME is too low.
## ..removing 15 genes from module 12 because their KME is too low.
## ..removing 20 genes from module 13 because their KME is too low.
## ..removing 18 genes from module 14 because their KME is too low.
## ..removing 10 genes from module 15 because their KME is too low.
## ..removing 11 genes from module 16 because their KME is too low.
## ..removing 2 genes from module 17 because their KME is too low.
## ..removing 6 genes from module 18 because their KME is too low.
## ..removing 5 genes from module 19 because their KME is too low.
## ..removing 2 genes from module 20 because their KME is too low.
## ..removing 3 genes from module 21 because their KME is too low.
## ..removing 1 genes from module 22 because their KME is too low.
## ..Working on block 6 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 6 into file ./data/network/exp_mat_TOM-block.6.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 982 genes from module 1 because their KME is too low.
## ..removing 665 genes from module 2 because their KME is too low.
## ..removing 160 genes from module 3 because their KME is too low.
## ..removing 193 genes from module 4 because their KME is too low.
## ..removing 100 genes from module 5 because their KME is too low.
## ..removing 9 genes from module 6 because their KME is too low.
## ..removing 7 genes from module 7 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.15
## Calculating new MEs...
table(net$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 10232 1464 1274 1102 918 795 744 711 687 641 636 575 548
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 541 498 492 463 429 406 389 379 305 299 295 271 263
## 26 27 28 29 30 31 32 33 34 35 36 37 38
## 245 220 212 199 183 175 171 167 155 133 120 112 110
## 39 40 41 42 43 44 45 46 47 48 49 50 51
## 110 108 103 88 85 77 75 74 68 67 66 62 61
## 52 53 54 55 56 57 58 59 60 61 62 63 64
## 60 57 57 57 53 53 53 50 46 41 35 31 31
## 65
## 28
Saving the Network
The components of the network required for downstream analyses are saved so the network does not need to be calculated multiple times.
nSamples = 42
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs
dim(MEs)
## [1] 42 66
geneTree = net$dendrograms[[1]]
save(sft, MEs, moduleLabels, moduleColors, geneTree, nSamples,
file = "./data/network/network.RData")
Loading the Network
In order to return to this script and run downstream analyses without recalculating the network, the necessary artifacts can be loaded using this code.
load("./data/network/network.RData")
Annotating the Network
In order to interpret the network it is useful to label the genes in the network with identifiers, e.g. Ensembl gene IDs, entrez IDs, etc. We use the biomaRt package, to map the identifiers to gene symbols. These steps will vary based on the organism you use and the gene identifiers used in the list of relevant phenotype and disease associated genes you generate, examples of which were read in Before you Begin step (5) Curating lists of known phenotype and disease associated genes.
# establish the database that matches your organism
ensembl = useMart("ensembl")
mm19 = useDataset(mart = ensembl, dataset = "mmusculus_gene_ensembl")
# These functions may be used to determine the attributes and filters to apply in the database query
# listAttributes(mm19)
# listFilters(mm19)
# Next, query the BioMart database for annotations for the genes of interest and merge the output of the query with the network results
m = getBM(attributes = c("ensembl_transcript_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position"),
filters = c("ensembl_transcript_id"),
values = colnames(exp_mat),
mart = mm19)
gene_MEs <- rbind(colnames(exp_mat), moduleColors, moduleLabels)
gene_MEs <- t(gene_MEs)
colnames(gene_MEs)[1:3] <- c("ensembl_transcript_id", "mod_color", "mod_number")
gene_MEs <- as.data.frame(gene_MEs)
annotated_mods = merge(gene_MEs, m, by = "ensembl_transcript_id")
# now you can filter annotated_mods to see the genes in each co-expression module
annotated_mods %>%
filter(mod_color == "purple")
(3) Gene Ontology Analysis
In this section, we describe the process of identifying gene ontology processes that are enriched in each coexpression module, and look at an example results table.
While tools exist for performing gene ontology analysis in R, our tool of preference for this project is ToppFun, of the ToppGene suite: https://toppgene.cchmc.org/enrichment.jsp, which is a web interface tool. ToppGene is unique in that it not only searches for enriched gene ontology categories and pathways for your genes of interest, but also human and mouse phenotypes, publications and published coexpression data sets, gene families, microRNAs, drugs, and diseases.
ToppFun reports back the significance of each enrichment with an assortment of multiple testing correction methods, the number of hits for that category from your query ("Hit Count in Query List"), and the total number of genes in the category ("Hit Count in Genome").
# You can extract the gene names for a module by writing the module out to a .csv file which can be fed into external tools, like ToppGene
annotated_mods %>%
filter(mod_number == 1) %>%
write_csv(., "./data/network/mod_1.csv")
# From ToppFun, you can click the "Download All" button from your results page, and get all of the results, so you can save them, browse them in R, and compare different modules in R
dir.create("./data/gene_ontology")
## Warning in dir.create("./data/gene_ontology"): './data/gene_ontology' already
## exists
url = "https://www.cell.com/cms/10.1016/j.celrep.2020.108145/attachment/60cdba8a-aa04-4c22-8ca4-4f8f0e6173dd/mmc3.xlsx"
download.file(url = url, destfile = "./data/gene_ontology/toppfun_res.xlsx")
toppfun_1 = read_excel("./data/gene_ontology/toppfun_res.xlsx", sheet = 1, skip = 1)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in J2086 / R2086C10: got 'NA'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in J2211 / R2211C10: got 'NA'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3600 / R3600C3: got 'Color representing the co-expression
## module used for GO analysis'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3601 / R3601C3: got 'Number of genes in each co-expression
## module'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3602 / R3602C3: got 'Category of query GO term'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3603 / R3603C3: got 'GO ID'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3604 / R3604C3: got 'Query GO term'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3605 / R3605C3: got 'P-value of enrichment'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3606 / R3606C3: got 'Bonferroni corrected p-value'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3607 / R3607C3: got 'Benjamini-Hochberg corrected p-value'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3608 / R3608C3: got 'Benjamini–Yekutieli corrected p-
## value'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3609 / R3609C3: got 'Number of module genes in GO category
## list'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3610 / R3610C3: got 'Number of gene in the GO cateogry'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C3611 / R3611C3: got 'Genes from query list overlapping
## hits'
toppfun_1
(4) Module Enrichment
In this section, we conduct Fisher's Exact tests for module enrichment for GWAS genes, disease genes, and phenotype associated genes and generate tables and plots of the results. In this section, we also read in the GWAS gene list from the orginial publication, which is a composite of the GWAS region analysis across multiple GWAS studies. ### Identify modules enriched for GWAS genes
## Reading in composite GWAS gene list ##
url = "https://www.cell.com/cms/10.1016/j.celrep.2020.108145/attachment/d7558393-85e5-492c-89c3-29660938dd22/mmc4.xlsx"
download.file(url = url, destfile = "./data/gwas/composite_gwas_list.xlsx")
pub_gwas_genes = read_excel("./data/gwas/composite_gwas_list.xlsx", sheet = 1)
pub_gwas_genes = pub_gwas_genes %>%
filter(GWAS_study_first_author %in% c("Estrada, et al.", "Kemp, et al."))
# create list of all modules
colors = unique(moduleColors)
colors = colors[colors != "grey"]
# create object for each module and compute enrichment of overlap between module and GWAS gene list
for (i in 1:length(colors)){
color = colors[i]
matrix_name = paste("mod_",color,"_gene_exp", sep = "")
assign(paste0("mod_",color,"_gene_exp"), subset(gene_MEs, gene_MEs$mod_color == color))
assign(paste0("mod_",color,"_gene_exp"), as.data.frame(get(paste0("mod_",color,"_gene_exp"))))
assign(paste0("mod_",color,"_gene_ids"), rownames(get(paste0("mod_",color,"_gene_exp"))))
assign(paste0("mod_",color,"_trx_info"), annotated_mods %>% filter(mod_color == color))
d = as.data.frame(get(paste0("mod_",color,"_trx_info")))
x = unique(d$external_gene_name) %in% unique(pub_gwas_genes$mouse_gene_name)
x = sum(x == TRUE)
a <- x
b <- (length(unique(pub_gwas_genes$mouse_gene_name)) - a)
c <- (dim(d)[1] - a)
e <- (29255 - c - b - a)
assign(paste0("ft_mod_",color), fisher.test(matrix(c(a,b,c,e),2,2,byrow=TRUE),
alternative='greater'))
}
# format results in table
gwas_enrichment_results = as.data.frame(matrix(nrow=65,ncol=3))
colnames(gwas_enrichment_results) <- c('module_color','p_value','odds_ratio')
for (i in 1:length(colors)){
color = colors[i]
ft = get(paste0("ft_mod_",color))
gwas_enrichment_results[i,1] = color
gwas_enrichment_results[i,2] = ft$p.value
gwas_enrichment_results[i,3] = ft$estimate
}
# Computed FDR values, -log10(p-values), and view results
gwas_enrichment_results$p.adj = p.adjust(gwas_enrichment_results$p_value, method = "fdr", n = length(gwas_enrichment_results$p_value))
gwas_enrichment_results$neg_log10 = -log10(gwas_enrichment_results$p.adj)
gwas_enrichment_results %>% arrange(p.adj)
Plot the module enrichment and significance for GWAS genes
gwas_enrichment_results %>%
ggplot(aes(x = odds_ratio, y = neg_log10)) +
geom_point(aes(color = module_color), size = 4) +
geom_point(color = "black", size = 4, pch = 21) + theme_bw() +
geom_hline(yintercept = -log10(0.05), color = "red") +
scale_colour_identity()
Browse enriched modules
The annotated modules can be retrieved so the genes in the enriched modules can be browsed.
# Filtering annotated modules to browse one module
annotated_mods %>%
filter(mod_color == "purple")
Identify modules enriched for genes associated with monogenic bone disorders
In addition to identifying modules enriched for GWAS genes, we also identify modules enriched for genes known to cause related monogenic diseases.
# First, the human monogenic gene IDs are converted to mouse IDs
mono_gene_hom <- homology %>%
filter(Symbol %in% monogenic_genes)
mono_mouse_hom <- homology %>%
filter(`HomoloGene ID` %in% mono_gene_hom$`HomoloGene ID`) %>%
filter(`NCBI Taxon ID` == 10090)
mono_mouse_genes = mono_mouse_hom$Symbol
# Then the loop is run to identify enriched modules
for (i in 1:length(colors)){
color = colors[i]
d = as.data.frame(get(paste0("mod_",color,"_trx_info")))
x = unique(d$external_gene_name) %in% unique(mono_mouse_genes)
x = sum(x == TRUE)
a <- x
b <- (length(unique(mono_mouse_genes)) - a)
c <- (dim(d)[1] - a)
e <- (29255 - a - b -c)
assign(paste0("ft_mod_",color), fisher.test(matrix(c(a,b,c,e),2,2,byrow=TRUE),
alternative='greater'))
}
# the results table is formatted
mono_results = as.data.frame(matrix(nrow=59,ncol=3))
colnames(mono_results) <- c('module_color','p_value','odds_ratio')
for (i in 1:length(colors)){
color = colors[i]
enrichment = get(paste0("ft_mod_",color))
mono_results[i,1] = color
mono_results[i,2] = enrichment$p.value
mono_results[i,3] = enrichment$estimate
}
mono_results$p.adj = p.adjust(mono_results$p_value, method = "fdr", n = length(mono_results$p_value))
mono_results$neg_log10 = -log10(mono_results$p.adj)
filter(mono_results, mono_results$p.adj < 0.05) %>%
arrange(desc(odds_ratio))
Plot the module enrichment and significance for monogenic disease genes
mono_results %>%
ggplot(aes(x = odds_ratio, y = neg_log10)) +
geom_point(aes(color = module_color), size = 4) +
geom_point(color = "black", size = 4, pch = 21) + theme_bw() +
geom_hline(yintercept = -log10(0.05), color = "red") +
scale_colour_identity()
Identify modules enriched for genes associated with bone phenotypes in model organisms
Finally, we identify modules enriched for genes known to cause related phenotypes in model organisms.
# Then the loop is run to identify modules enriched for phenotype associated genes
for (i in 1:length(colors)){
color = colors[i]
d = as.data.frame(get(paste0("mod_",color,"_trx_info")))
x = unique(d$external_gene_name) %in% unique(phen_genes)
x = sum(x == TRUE)
a <- x
b <- (length(unique(phen_genes)) - a)
c <- (dim(d)[1] - a)
e <- (29255 - a - b -c)
assign(paste0("ft_mod_",color), fisher.test(matrix(c(a,b,c,e),2,2,byrow=TRUE),
alternative='greater'))
}
# the results table is formatted
phen_results = as.data.frame(matrix(nrow=59,ncol=3))
colnames(phen_results) <- c('module_color','p_value','odds_ratio')
for (i in 1:length(colors)){
color = colors[i]
enrichment = get(paste0("ft_mod_",color))
phen_results[i,1] = color
phen_results[i,2] = enrichment$p.value
phen_results[i,3] = enrichment$estimate
}
phen_results = arrange(phen_results, desc(odds_ratio))
phen_results$p.adj = p.adjust(phen_results$p_value, method = "fdr", n = length(phen_results$p_value))
phen_results$neg_log10 = -log10(phen_results$p.adj)
filter(phen_results, phen_results$p.adj < 0.05)
Plot the module enrichment and significance for genes associated with bone phenotype from model systems
phen_results %>%
ggplot(aes(x = odds_ratio, y = neg_log10)) +
geom_point(aes(color = module_color), size = 4) +
geom_point(color = "black", size = 4, pch = 21) + theme_bw() +
geom_hline(yintercept = -log10(0.05), color = "red") +
scale_colour_identity()
(5) LD Score Regression
LD score regression (ldsc package) is used to calculated the partitioned heritability attributed to SNPs surrounding the genes that compose each module. The github repo and the wiki for the ldsc package has detailed tutorials for how to carry out this analysis. It is not wrapped for R, but can be run on the command line using python. The general steps taken are outlined here:
# (1) First, we need to format the GWAS summary statistics for input into the lsdc algorithm
## munge sumstats, returns a file called out.sumstats.gz
# ./munge_sumstats.py \
# --out BMD \
# --merge-alleles w_hm3.snplist \
# --a1-inc \
# --sumstats bmd_gwas_sumstats.txt
# (2) Generate genesets for each module for all chromosomes; this requires a list of gene identifiers for the genes in each module, a file indicated the coordinates for each identifier, and the plink .bim files of pre-computed LD scores. In this application we use the 1000 genomes plink files, the Genesets are listed as Ensembl gene IDs, and the gene coordinate file "ENSG_coord.txt", provided with the ldsc package, and we use a windowsize of 100,000, as recommended by the authors of the application
## make annotations for each module and each chromosome
# python ../../src/ldsc/make_annot.py --gene-set-file violet_module_human_gene_ids.Geneset --gene-coord-file ENSG_coord.txt --windowsize 100000 --bimfile ./1000G_plinkfiles/1000G.mac5eur.1.bim --annot-file ./violet_annot/violet_module.1.annot.gz
# (3) Finally, using all of these annotations, run ldsc using the processed summary statistics, the base annotation paths for the modules' annotations, the SNP weights and frequencies for the European 1000 Genomes data that are provided with the ldsc package, the overlap annotations was used because transcripts were used to generate the co-expression network, so the gene sets are non-disjoint, and finally, a base name for the output is provided.
## compute LDSC, outputs out.log and out.results
# python ldsc.py
# --h2 BMD.sumstats.gz\
# --ref-ld-chr antiquewhite4_module.,
# bisque4_module.,black_module.,
# blue_module., ...etc.\
# --w-ld-chr ./weights_hm3_no_hla/weights.
# --overlap-annot\
# --frqfile-chr 1000G.mac5eur.\
# --out BMD_all_modules_compare
### The output of the last step includes a log, which echos the command used to run the regression and provides a summary of the results, and a results table, which reports the proportion of SNPs, the proportion of heritability, the standard error fo the proportion of heritability, the heritability, the standard error of the heritability, and a p-value indicating the statistical significance of the enrichment
#ldsc_log = read_lines("./data/ld_score_regression/BMD_all_modules_compare.log")
#ldsc_results = read_tsv("./data/ld_score_regression/BMD_all_modules_compare.results")
# The results table has a category for each annotation provided, but they have generic labels. The log keeps track of the annotation input, so we can get the names from the log, because the order of the annotation matters
# Here is an example of table of results from an ldsc run arranged by p-value
dir.create("./data/ld_score_regression")
## Warning in dir.create("./data/ld_score_regression"): './data/
## ld_score_regression' already exists
url = "https://www.cell.com/cms/10.1016/j.celrep.2020.108145/attachment/921fc33d-d674-4488-951f-295320b24cea/mmc5.xlsx"
download.file(url = url, destfile = "./data/ld_score_regression/ld_score_reg_res.xlsx")
ldsc_res = as.data.frame(read_excel("./data/ld_score_regression/ld_score_reg_res.xlsx", sheet = 3))
# A p-value adjustment is also applied to correct for testing across all modules
ldsc_res$padj = p.adjust(ldsc_res$Enrichment_p, method = "fdr", n = length(ldsc_res$Enrichment_p))
ldsc_res$neg_log10 = -log10(ldsc_res$padj)
ldsc_res %>%
dplyr::select(module, Enrichment, padj, neg_log10) %>%
filter(padj < 0.05)
Plot the LD score enrichments for all modules
ldsc_res %>%
ggplot(aes(x = Enrichment, y = neg_log10)) +
geom_point(aes(color = module), size = 4) +
geom_point(color = "black", size = 4, pch = 21) + theme_bw() +
geom_hline(yintercept = -log10(0.05), color = "red") +
scale_colour_identity()
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
(6) Gene-level analysis prioritization
In this section, we calculate each genes' module membership (correlation with the module eigengene) for each module and save the results and a p-value. Quantitatively, the module membership score is the correlation between the module’s eigengene, which describes the collective expression profile of the group of genes, and each individual gene’s expression pattern. Module membership is highly correlated with intramodular connectivity, and thus, intramodular hub genes tend to have high module membership.
# calculate the correlation between the expression profiles and module eigengenes for each pair and the p-value
MMvalue = as.data.frame(cor(norm_exp_mat, MEs, use='p'))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix (MMvalue), nSamples))
# map the names of the modules from numeric to color names
modMap = unique(as.data.frame(cbind(moduleColors, moduleColors, moduleLabels), row.names = NA))
colnames(modMap) = c("MM.p.moduleColors", "MM.moduleColors", "moduleLabels")
modMap$moduleLabels = paste0("ME", modMap$moduleLabels)
modMap$MM.moduleColors = paste0("MM.", modMap$MM.moduleColors)
modMap$MM.p.moduleColors = paste0("MM.p.", modMap$MM.p.moduleColors)
names(MMvalue) = modMap$MM.moduleColors[match(names(MMvalue), modMap$moduleLabels)]
names(MMPvalue) = modMap$MM.p.moduleColors[match(names(MMPvalue), modMap$moduleLabels)]
# moving the transcript IDs as a column
MMvalue$Transcript.ID = rownames(MMvalue)
MMPvalue$Transcript.ID = rownames(MMPvalue)
# the resulting tables have column names of the form MM."module_color" and MM.p."module_color"
MMvalue